# SETWD ===========================
rm(list=ls())


# LIBRARIES ===========================

library(tidyverse)
library(foreign)
library(stargazer)
library(lmtest)
library(multiwayvcov)

# IMPORT DATA ===========================

dfgi <- read.csv("data/sve-08_04_24.csv",stringsAsFactors = F)

# MAIN ANALYSIS ===========================

temp <- dfgi%>%
  filter(!is.na(gini_disp))

## Descriptive Stats ----
# number of countries:
length(unique(temp$country.name))

# number of observations
length((temp$country.name))

# percent of observations eroding:
mean(temp$erosion.strict)

## TAB 1. Logit Regression Explaining Erosion ----

m1 <- glm(erosion.strict ~ gini_disp, data=temp, family=binomial)
crse1 <- coeftest(m1, cluster.vcov(m1, temp$country.name))
crse1
nobs(m1)

m2 <- glm(erosion.strict ~ gini_disp + log(gdppc), data=temp, family=binomial)
crse2 <- coeftest(m2, cluster.vcov(m2, temp$country.name))
crse2
nobs(m2)

m3 <- glm(erosion.strict ~ gini_disp + log(gdppc) + year, 
          data=temp, family=binomial)
crse3 <- coeftest(m3, cluster.vcov(m3, temp$country.name))
crse3
nobs(m3)

stargazer(crse1,crse2,crse3, title="Logit",digits=3,
          header=F,df=F,omit.stat=c("f","ser","aic","ll"),
          add.lines = list(c("Observations", nobs(crse1), nobs(crse2), nobs(crse3))),
          star.cutoffs=c(0.1,0.05,0.01,0.001),
          star.char=c("\\dagger","*","**","***"),
          notes="$^{\\dagger} p<0.1$; $^{*} p<0.05$; $^{**} p<0.01$; $^{***} p<0.001$",
          covariate.labels=c("Gini","Logged GDP per capita",
                             "Year"),
          notes.append=F,table.placement="h!",
          out="tables/tab1-10_27_24.tex")

## FIG 2. Inequality and Probability of Erosion ----

gg <- ggplot(dfgi,aes(x=gini_disp,y=erosion.strict))+
  stat_smooth(method="glm",method.args = list(family = "binomial"),
              color="black")+
  theme_bw()

rib_data <- ggplot_build(gg)$data[[1]]

ggplot(dfgi,aes(x=gini_disp,y=erosion.strict))+
  stat_smooth(method="glm",method.args = list(family = "binomial"),
              color="black",se=F,linewidth=0.5)+
  geom_line(data = rib_data,aes(x = x,y = ymax),
            color="black",linetype="dotted") +  
  geom_line(data = rib_data,aes(x = x,y = ymin),
            color="black",linetype="dotted") +  
  scale_y_continuous(labels = 
                       scales::percent_format(accuracy = 1L))+
  xlab("Gini (post tax and transfer)")+
  ylab("Probability of Erosion")+
  theme_bw()

ggsave("figures/gini-pred.png",width=3.5,height=3.5)

## Country-specific predictions ----

# Sweden 2017 gini
dfgi%>%
  filter(country.name=="Sweden",year==2017)%>%
  dplyr::select(gini_disp)

# rank: 13%
mean(dfgi$gini_disp<26.4,na.rm=T)

# US 2017 gini
dfgi%>%
  filter(country.name=="United States",year==2017)%>%
  dplyr::select(gini_disp)

# rank: 60%
mean(dfgi$gini_disp<38.4,na.rm=T)

# max gini over all years: South Africa
dfgi%>%
  filter(!is.na(gini_disp))%>%
  filter(gini_disp==max(gini_disp))%>%
  dplyr::select(country.name,year,gini_disp)

# max gini in 2017: South Africa
dfgi%>%
  filter(!is.na(gini_disp))%>%
  filter(year==2017)%>%
  filter(gini_disp==max(gini_disp))%>%
  dplyr::select(country.name,year,gini_disp)

# predictions
temp <- dfgi%>%
  filter(!is.na(gini_disp)&!is.na(gdppc))

m1 <- glm(erosion.strict ~ gini_disp + log(gdppc), 
          data=temp, family="binomial")
temp$pred1 <- predict(m1, temp, type="response")

# predicted erosion in 2017
temp%>%
  filter(country.name%in%c("Sweden","United States","South Africa")&year==2017)%>%
  dplyr::select(country.name,pred1)



## Median Gini/GDP probabilities ----

temp <- dfgi%>%
  filter(!is.na(gini_disp)&!is.na(gdppc))

# distributions in the data of economic variables
summary(temp$gini_disp)
summary(temp$gdppc)

# create dataframe with median/25th/75th percentiles of econ variables and years
gini_disp <- rep(c(rep(median(temp$gini_disp),2),summary(temp$gini_disp)[5]),26)
gdppc <- rep(c(summary(temp$gdppc)[2],rep(median(temp$gdppc),2)),26)
year <- rep(seq(1995,2020,1),each=3)
df.pred <- data.frame(gini_disp,gdppc,year)
df.pred$pred <- NA

# generate predictions
m2 <- glm(erosion.strict ~ gini_disp + log(gdppc) + year, 
          data=temp, family="binomial")
df.pred$pred <- predict(m2, df.pred, type="response")

# summarize the change in predicted rate when gini increases
gini.effect <- df.pred%>%
  filter(gdppc>10000)%>%
  group_by(year)%>%
  summarise(gini_effect=last(pred)-first(pred))

# summarize the change in predicted rate when gdp decreases
gdp.effect <- df.pred%>%
  filter(gini_disp<40)%>%
  group_by(year)%>%
  summarise(gdp_effect=first(pred)-last(pred))

# merge data and calculate the ratio of the gini effect size/gdp effect size
df.fx <- full_join(gini.effect,gdp.effect)%>%
  mutate(ratio=gini_effect/gdp_effect)

# distribution of ratios across all years in the data: always >2
summary(df.fx$ratio)


## AUC estimate ----

auc <- function(temp){
  pos.scores <- temp%>%
    filter(!is.na(pred1))%>%
    filter(erosion.strict==1)%>%
    pull(pred1)
  
  neg.scores <- temp%>%
    filter(!is.na(pred1))%>%
    filter(erosion.strict==0)%>%
    pull(pred1)
  
  # estimate AUC
  set.seed(135102)
  reps <- 1000000
  mean(sample(pos.scores,reps,replace=T) > sample(neg.scores,reps,replace=T))
}

m1 <- glm(erosion.strict ~ gini_disp + log(gdppc) + year, 
          data=temp, family="binomial")
temp$pred1 <- predict(m1, temp, type="response")
auc(temp)
# AUC: 0.7975

